home *** CD-ROM | disk | FTP | other *** search
Text File | 1992-04-05 | 1.4 KB | 45 lines | [TEXT/KEEN] |
- #$RoughIndexer: towards generating a list of words to appear
- #in an index - skips common words (see $WordFrequency), prints
- #each word followed by all line locations. If you de-comment the
- #sorting lines that begin with '##%' and comment out the for-loop
- #at the beginning of the END block (both are in the END block),
- #this will also sort the words before printing.
- #This is just a test-jig for trying out notions
- #about automating the task of index generation - ask anyone and
- #they'll tell you it's impossible. Now that's a challenge.
-
- #This simple first version generates about 10 times too many
- #raw entries. Your mission, should you choose to accept it, is
- #to make it smart enough to be useful. If you succeed, you get
- #to be famous.
-
- # Startup: make print separate its fields with tabs, then load the
- # common-word list. Every whitespace-separated word in that file becomes
- # a key of common[]; the main rule skips any word found there.
- BEGIN {
-     OFS = "\t"
-     # STDPATH is not assigned anywhere in this program; presumably hAWK
-     # predefines it as the path of its standard folder - confirm.
-     commonfile = STDPATH "Drag_on Modules:hAWK programs:" "common words"
-     # Parenthesized so the comparison can only apply to getline's return
-     # value; the bare form "getline < commonfile > 0" is an ambiguous
-     # construct. Testing > 0 (rather than truth) also stops the loop when
-     # the file cannot be read (-1), not just at end of file (0).
-     while ((getline < commonfile) > 0) {
-         for (k = 1; k <= NF; k++)
-             common[$k] = 1
-     }
-     close(commonfile)
-     # Discard the word-list text that getline left in $0.
-     $0 = ""
- }
-
- # Per input line: reduce the line to candidate index words, then record
- # the line number under every word that is worth indexing.
- # indexLines[word] accumulates "\t" FNR for each recorded location.
- {
-     # Blank out everything except letters, digits, $, ' and -, so that the
-     # re-split record holds one candidate word per field.
-     gsub(/[^A-Za-z0-9$'-]/, " ")
-     for (k = 1; k <= NF; k++) {
-         word = $k
-         # Skip one-character tokens, words listed in common[], and tokens
-         # made only of the kept punctuation (for example a "--" dash),
-         # which would otherwise be indexed as if they were words.
-         if (length(word) <= 1 || (word in common) || word !~ /[A-Za-z0-9]/)
-             continue
-         # Record a location only once per word: without this, a word that
-         # appears several times on one line lists that line repeatedly.
-         # FILENAME is part of the key because FNR restarts for each file.
-         where = FILENAME ":" FNR
-         if (lastSeen[word] == where)
-             continue
-         lastSeen[word] = where
-         indexLines[word] = indexLines[word] "\t" FNR
-     }
- }
- # After all input: print every indexed word followed by its recorded
- # line numbers. Each indexLines[] value already starts with a tab, so the
- # word and its first line number end up separated by OFS plus that tab.
- END {
-     # Unsorted listing; for-in visits the words in no particular order.
-     for (term in indexLines) print term, indexLines[term]  # comment out if sorting
- #de-comment following lines for sorting
- # (sort() is not defined in this file; presumably a hAWK built-in - confirm)
- ##%for (w in indexLines)
- ##%linear[++m] = w "\t\t" indexLines[w]
- ##%sort(linear, ind, "d")
- ##%for (j = 1; j <= m; ++j)
- ##%print linear[ind[j]]
- }
-
-